{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "import dask.dataframe as dd\n",
    "import dask.distributed\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from matplotlib.colors import SymLogNorm as symlog\n",
    "from matplotlib import rcParams\n",
    "\n",
    "import sklearn\n",
    "import matplotlib.pyplot as plt\n",
    "import palettable\n",
    "\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pd.options.display.max_rows = 100\n",
    "pd.options.display.max_columns = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "trips = spark.read.parquet('/data/all_trips_spark.parquet/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pyspark.sql.functions as fn\n",
    "import pyspark.sql.types as ty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "trips2 = trips \\\n",
    "    .withColumn('pickup_datetime', fn.to_utc_timestamp(trips.pickup_datetime, 'America/New_York')) \\\n",
    "    .withColumn('dropoff_datetime', fn.to_utc_timestamp(trips.dropoff_datetime, 'America/New_York')) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "trips2 = trips2.withColumn('pickup_dt_group', fn.concat(\n",
    "        fn.date_format(trips2.pickup_datetime, 'yyyy-MM-dd hh:'),\n",
    "        fn.format_string('%02d:00', (6.0*fn.floor(fn.minute(trips2.pickup_datetime)/6.0)).cast(ty.IntegerType()))\n",
    "                                                       )\n",
    "                          )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Row(dropoff_datetime=datetime.datetime(2016, 6, 18, 21, 30, 7), dropoff_latitude=40.730159759521484, dropoff_longitude=-73.9839096069336, dropoff_taxizone_id=79, ehail_fee=None, extra=0.5, fare_amount=6.5, improvement_surcharge=0.30000001192092896, mta_tax=0.5, passenger_count=None, payment_type='\"1\"', pickup_datetime=datetime.datetime(2016, 6, 18, 21, 22, 52), pickup_latitude=40.732765197753906, pickup_longitude=-73.99752807617188, pickup_taxizone_id=113, rate_code_id=None, store_and_fwd_flag='\"N\"', tip_amount=1.0, tolls_amount=0.0, total_amount=8.800000190734863, trip_distance=0.800000011920929, trip_type='\"yellow\"', vendor_id='\"1\"', trip_id=6511170420736, pickup_dt_group='2016-06-18 09:18:00')"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trips2.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DataFrame[key: string, value: string]"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sqlContext.sql(\"set spark.sql.shuffle.partitions=1\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "trips2.groupBy('pickup_taxizone_id', 'pickup_dt_group').count().repartition(263, 'pickup_taxizone_id').write.parquet('/data/trips_timeseries_counts.parquet', mode='overwrite', compression='SNAPPY')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "d3 = dd.read_parquet('/tmp/junk.parquet/pickup_taxizone_id=1', engine='arrow')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "d4 = pd.DataFrame(index=pd.date_range('2009-01-01 00:00:00', '2017-01-01 00:00:00', freq='6T'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "d3 = d3.compute()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "d3 = d3.set_index('pickup_dt_group', drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['2009-01-01 01:00:00', '2009-01-01 01:36:00', '2009-01-01 01:48:00',\n",
       "       '2009-01-01 02:36:00', '2009-01-01 02:48:00', '2009-01-01 03:42:00',\n",
       "       '2009-01-01 03:48:00', '2009-01-01 04:06:00', '2009-01-01 04:12:00',\n",
       "       '2009-01-01 04:18:00',\n",
       "       ...\n",
       "       '2016-12-30 10:36:00', '2016-12-30 11:48:00', '2016-12-31 02:30:00',\n",
       "       '2016-12-31 04:54:00', '2016-12-31 06:30:00', '2016-12-31 09:00:00',\n",
       "       '2016-12-31 09:48:00', '2016-12-31 10:36:00', '2016-12-31 11:00:00',\n",
       "       '2016-12-31 12:12:00'],\n",
       "      dtype='object', name='pickup_dt_group', length=98275)"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d3.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "d4.index = d4.index.astype(str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "d5 = d4.merge(d3, how='left', left_index=True, right_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2009-01-01 00:00:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 00:06:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 00:12:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 00:18:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 00:24:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 00:30:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 00:36:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 00:42:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 00:48:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 00:54:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 01:00:00</th>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 01:06:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 01:12:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 01:18:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 01:24:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 01:30:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 01:36:00</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 01:42:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 01:48:00</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 01:54:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 02:00:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 02:06:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 02:12:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 02:18:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 02:24:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 02:30:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 02:36:00</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 02:42:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 02:48:00</th>\n",
       "      <td>2.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 02:54:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 03:00:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 03:06:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 03:12:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 03:18:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 03:24:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 03:30:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 03:36:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 03:42:00</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 03:48:00</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 03:54:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 04:00:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 04:06:00</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 04:12:00</th>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 04:18:00</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 04:24:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 04:30:00</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 04:36:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 04:42:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 04:48:00</th>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2009-01-01 04:54:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 19:06:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 19:12:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 19:18:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 19:24:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 19:30:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 19:36:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 19:42:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 19:48:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 19:54:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 20:00:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 20:06:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 20:12:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 20:18:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 20:24:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 20:30:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 20:36:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 20:42:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 20:48:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 20:54:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 21:00:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 21:06:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 21:12:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 21:18:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 21:24:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 21:30:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 21:36:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 21:42:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 21:48:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 21:54:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 22:00:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 22:06:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 22:12:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 22:18:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 22:24:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 22:30:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 22:36:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 22:42:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 22:48:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 22:54:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 23:00:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 23:06:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 23:12:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 23:18:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 23:24:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 23:30:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 23:36:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 23:42:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 23:48:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2016-12-31 23:54:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2017-01-01 00:00:00</th>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>701281 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                     count\n",
       "2009-01-01 00:00:00    NaN\n",
       "2009-01-01 00:06:00    NaN\n",
       "2009-01-01 00:12:00    NaN\n",
       "2009-01-01 00:18:00    NaN\n",
       "2009-01-01 00:24:00    NaN\n",
       "2009-01-01 00:30:00    NaN\n",
       "2009-01-01 00:36:00    NaN\n",
       "2009-01-01 00:42:00    NaN\n",
       "2009-01-01 00:48:00    NaN\n",
       "2009-01-01 00:54:00    NaN\n",
       "2009-01-01 01:00:00    2.0\n",
       "2009-01-01 01:06:00    NaN\n",
       "2009-01-01 01:12:00    NaN\n",
       "2009-01-01 01:18:00    NaN\n",
       "2009-01-01 01:24:00    NaN\n",
       "2009-01-01 01:30:00    NaN\n",
       "2009-01-01 01:36:00    1.0\n",
       "2009-01-01 01:42:00    NaN\n",
       "2009-01-01 01:48:00    1.0\n",
       "2009-01-01 01:54:00    NaN\n",
       "2009-01-01 02:00:00    NaN\n",
       "2009-01-01 02:06:00    NaN\n",
       "2009-01-01 02:12:00    NaN\n",
       "2009-01-01 02:18:00    NaN\n",
       "2009-01-01 02:24:00    NaN\n",
       "2009-01-01 02:30:00    NaN\n",
       "2009-01-01 02:36:00    1.0\n",
       "2009-01-01 02:42:00    NaN\n",
       "2009-01-01 02:48:00    2.0\n",
       "2009-01-01 02:54:00    NaN\n",
       "2009-01-01 03:00:00    NaN\n",
       "2009-01-01 03:06:00    NaN\n",
       "2009-01-01 03:12:00    NaN\n",
       "2009-01-01 03:18:00    NaN\n",
       "2009-01-01 03:24:00    NaN\n",
       "2009-01-01 03:30:00    NaN\n",
       "2009-01-01 03:36:00    NaN\n",
       "2009-01-01 03:42:00    1.0\n",
       "2009-01-01 03:48:00    1.0\n",
       "2009-01-01 03:54:00    NaN\n",
       "2009-01-01 04:00:00    NaN\n",
       "2009-01-01 04:06:00    1.0\n",
       "2009-01-01 04:12:00    3.0\n",
       "2009-01-01 04:18:00    1.0\n",
       "2009-01-01 04:24:00    NaN\n",
       "2009-01-01 04:30:00    1.0\n",
       "2009-01-01 04:36:00    NaN\n",
       "2009-01-01 04:42:00    NaN\n",
       "2009-01-01 04:48:00    1.0\n",
       "2009-01-01 04:54:00    NaN\n",
       "...                    ...\n",
       "2016-12-31 19:06:00    NaN\n",
       "2016-12-31 19:12:00    NaN\n",
       "2016-12-31 19:18:00    NaN\n",
       "2016-12-31 19:24:00    NaN\n",
       "2016-12-31 19:30:00    NaN\n",
       "2016-12-31 19:36:00    NaN\n",
       "2016-12-31 19:42:00    NaN\n",
       "2016-12-31 19:48:00    NaN\n",
       "2016-12-31 19:54:00    NaN\n",
       "2016-12-31 20:00:00    NaN\n",
       "2016-12-31 20:06:00    NaN\n",
       "2016-12-31 20:12:00    NaN\n",
       "2016-12-31 20:18:00    NaN\n",
       "2016-12-31 20:24:00    NaN\n",
       "2016-12-31 20:30:00    NaN\n",
       "2016-12-31 20:36:00    NaN\n",
       "2016-12-31 20:42:00    NaN\n",
       "2016-12-31 20:48:00    NaN\n",
       "2016-12-31 20:54:00    NaN\n",
       "2016-12-31 21:00:00    NaN\n",
       "2016-12-31 21:06:00    NaN\n",
       "2016-12-31 21:12:00    NaN\n",
       "2016-12-31 21:18:00    NaN\n",
       "2016-12-31 21:24:00    NaN\n",
       "2016-12-31 21:30:00    NaN\n",
       "2016-12-31 21:36:00    NaN\n",
       "2016-12-31 21:42:00    NaN\n",
       "2016-12-31 21:48:00    NaN\n",
       "2016-12-31 21:54:00    NaN\n",
       "2016-12-31 22:00:00    NaN\n",
       "2016-12-31 22:06:00    NaN\n",
       "2016-12-31 22:12:00    NaN\n",
       "2016-12-31 22:18:00    NaN\n",
       "2016-12-31 22:24:00    NaN\n",
       "2016-12-31 22:30:00    NaN\n",
       "2016-12-31 22:36:00    NaN\n",
       "2016-12-31 22:42:00    NaN\n",
       "2016-12-31 22:48:00    NaN\n",
       "2016-12-31 22:54:00    NaN\n",
       "2016-12-31 23:00:00    NaN\n",
       "2016-12-31 23:06:00    NaN\n",
       "2016-12-31 23:12:00    NaN\n",
       "2016-12-31 23:18:00    NaN\n",
       "2016-12-31 23:24:00    NaN\n",
       "2016-12-31 23:30:00    NaN\n",
       "2016-12-31 23:36:00    NaN\n",
       "2016-12-31 23:42:00    NaN\n",
       "2016-12-31 23:48:00    NaN\n",
       "2016-12-31 23:54:00    NaN\n",
       "2017-01-01 00:00:00    NaN\n",
       "\n",
       "[701281 rows x 1 columns]"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "d5.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "%config InlineBackend.figure_format='retina'\n",
    "\n",
    "import dask.dataframe as dd\n",
    "import dask.distributed\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from matplotlib.colors import SymLogNorm as symlog\n",
    "from matplotlib import rcParams\n",
    "\n",
    "import sklearn\n",
    "import matplotlib.pyplot as plt\n",
    "import palettable\n",
    "\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pd.options.display.max_rows = 100\n",
    "pd.options.display.max_columns = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "client = dask.distributed.Client()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "trips = dd.read_parquet('/data/all_trips.parquet', engine='fastparquet', index='pickup_datetime', \n",
    "                        columns=['pickup_datetime', 'pickup_taxizone_id', 'dropoff_datetime', 'dropoff_taxizone_id'])\n",
    "trips.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def parse_to_6min_freq(df):\n",
    "    return df.index.values.astype('M8[m]').astype(str)\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "parse_to_6min_freq(trips.get_partition(0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "trips['pickup_minute'] = trips.map_partitions(parse_to_6min_freq, meta=('pickup_minute', object))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "trips.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
